library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load and prepare data


Load dataset (preprocessing code in 20_04_07_create_dataset.html)

# Clusterings created by WGCNA
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
dataset$Module = clusterings$Module

# Update gene scores to new SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
gene_scores = dataset %>% mutate(ID = rownames(.)) %>% 
              left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`)) %>% pull(`gene-score`)
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(gene.score = gene_scores)
rownames(dataset) = rownames_dataset

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)

rm(getinfo, mart, rownames_dataset, GO_annotations, gene_scores)


Gene filtering:


  • Remove genes without cluster (Module=gray)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

cat(paste0('Removing ', sum(dataset$Module=='gray'), ' genes without cluster'))
## Removing 113 genes without cluster
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor))


Variable changes:


  • Using Module Membership variables instead of binary module membership

  • Not including p-value variables

  • Including a new variable with the absolute value of GS

  • Removing information from gray module (unclassified genes)

  • Objective variable: Binary label indicating if it’s in the SFARI dataset or not

new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=!is.na(gene.score)) %>%
              dplyr::select(-gene.score)

rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']

rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
cat(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables'))
## The final dataset contains 16034 observations and 58 variables
rm(new_dataset)



Exploratory Analysis


PCA of Variables

The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS in the middle of both groups

mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))



plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)


PCA of Samples

  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • I don’t know what the 2nd PC is capturing

# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
pca = dataset %>% t %>% prcomp

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)



Dividing samples into Training and Test Sets


4.91% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.

table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  15247
   TRUE  787
   #Total cases  16034
rm(table_info)

To divide our samples into training and test sets:

set.seed(123)

sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


# # Validation Set
# aux_dataset = dataset[-train_idx,]
# validation_idx = createDataPartition(sample_scores$gene.score[-train_idx], p = 0.5, list = FALSE)
# validation_set = aux_dataset[validation_idx,]
# 
# # Test Set
# test_set = aux_dataset[-valudation_idx,]


rm(sample_scores, train_idx)


Label distribution in training set

To fix this class imbalance, we are going to use SMOTE, an over-sampling technique that over-samples the minority class (SFARI Genes) by creating synthetic examples, in the training set

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  10673
   TRUE  553
   #Total cases  11226

Labels distribution in test set

This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  4574
   TRUE  234
   #Total cases  4808


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

smote_over_sampling = trainControl(method = 'repeatedcv', number = 10, repeats = 10, verboseIter = FALSE, 
                                   classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
#a = caret::thresholder(fit, threshold = seq(0.05, 0.95, by = 0.1), statistics = 'all')

rm(smote_over_sampling)


Performance


The model has an AUC of 0.6833

But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model

# VIF
plot_data = data.frame('Feature' = car::vif(fit$finalModel) %>% sort %>% names,
                       'VIF' = car::vif(fit$finalModel) %>% sort %>% unname) %>%
            mutate(outlier = VIF>10)

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data)

Correlation plot

# Correlation plot
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')

rm(getinfo, mart, clusterings)


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again



Ridge Regression


Notes:

### DEFINE FUNCTIONS

create_train_test_sets = function(p, seed){
  
  # Get SFARI Score of all the samples so our train and test sets are balanced for each score
  sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
                  left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                             by = 'ID') %>% 
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  set.seed(seed)
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
  
  train_set = dataset[train_idx,]
  test_set = dataset[-train_idx,]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
}



run_model = function(p, over_sampling_fold, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  smote_over_sampling = trainControl(method = 'repeatedcv', number = over_sampling_fold, repeats = 10,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 
              'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}


### RUN MODEL

# Parameters
p = 0.75
over_sampling_fold = 10

n_iter = 20
seeds = 123:(123+n_iter-1)

# Store outputs
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, over_sampling_fold, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  prec = c(prec, model_output[['prec']])
  rec = c(rec, model_output[['rec']])
  F1 = c(F1, model_output[['F1']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)

rm(p, over_sampling_fold, seeds, update_preds, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

test_set = predictions %>% filter(n>0) %>% left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  12083 3119   15202
   TRUE  413 371   784
   #Total cases  12496 3490   15986
rm(conf_mat)


Accuracy: Mean = 0.7754 SD = 0.012


Precision: Mean = 0.1059 SD = 0.0074


Recall: Mean = 0.4846 SD = 0.0303


F1 score: Mean = 0.1738 SD = 0.0112


ROC Curve: Mean = 0.6936 SD = 0.0166

pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')


Lift Curve

lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor and absGS have a very small coefficient and Gene Significance has a negative coefficient

gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% left_join(assigned_module, by ='ID') %>%
                 mutate(Module = gsub('#','',Module))

coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% summarise('SFARI_perc' = mean(SFARI)) %>%
            arrange(desc(coef))

kable(coef_info %>% dplyr::select(feature, coef) %>% rename('Feature' = feature, 'Coefficient' = coef),
      align = 'cc', caption = 'Regression Coefficients')
Regression Coefficients
Feature Coefficient
00A8FF 0.9006363
F8766D 0.6120119
EE67EC 0.6041166
8195FF 0.5563650
B79F00 0.5540209
00BECF 0.4445864
BF80FF 0.3579521
00BF74 0.3470143
00C083 0.3418488
DB8E00 0.3256616
FA62DA 0.2722904
619CFF 0.2622649
00ADFA 0.2535136
00B9E3 0.2524223
FE61CF 0.2457656
00BFC4 0.2399300
00C0B9 0.2224579
00BC50 0.2199474
2EA2FF 0.2127274
73B000 0.1998405
EE8045 0.1891572
FC727E 0.1859399
FF66AA 0.1681994
D39200 0.1597731
A0A600 0.1330359
CA9700 0.1251267
00BBDA 0.0923871
00C19F 0.0806976
FE6D8D 0.0205348
DB72FB 0.0150872
MTcor -0.0013881
FF699C -0.0094331
ACA300 -0.0455853
GS -0.0456309
CE79FF -0.0578505
998EFF -0.0582525
absGS -0.0940893
84AD00 -0.1046840
00C092 -0.1222770
00B80F -0.1243828
F564E3 -0.1375280
93AA00 -0.1426257
FF61C3 -0.1524681
F37B5A -0.1622455
00BD63 -0.1876835
E56CF4 -0.2038396
FF63B7 -0.2043036
AE87FF -0.2088116
E88526 -0.2285037
41B500 -0.2568549
00BA38 -0.3058564
5EB300 -0.3291940
00B6EC -0.4453079
E28900 -0.4997283
00B2F4 -0.5128669
Intercept -0.5368421
00C1AC -0.5392961
C19B00 -0.5488004

  • There is a positive relation between the coefficient assigned to the membership of each module and the percentage of SFARI genes that are assigned to that module
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, SFARI_perc)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('% of SFARI genes in Module'))


  • There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse model


SFARI genes have a higher score distribution than the rest, but the overlap is large

plot_data = test_set %>% dplyr::select(prob, SFARI)

ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
  • There seems to be a small but consistent positive relation between the SFARI scores and the probability assigned by the model
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  142
   2  205
   3  437
   Neuronal  930
   Others  14272
   #Total cases  15986
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal())
comparisons = list(c('1','Neuronal'), c('2','Neuronal'), c('3','Neuronal'), c('1','Others'), c('2','Others'),
                   c('3','Others'), c('1','2'), c('2','3'), c('Neuronal', 'Others'))

plot_data %>% ggplot(aes(gene.score, prob, color = gene.score)) + 
              stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') + 
              stat_summary(fun = mean, geom = 'point', size = 3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), 
                                 label.y = c(.8, .75, .7, .95, .9, .85, rep(1, 3)), tip.length = 0.01) +
                                 #label.y = c(.5, .45, .4, .7, .65, .6, rep(0.8, 3)), tip.length = 0.01) +
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI Score') + ylab('Probability') +
              scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              theme_minimal() + theme(legend.position = 'none')

rm(mean_vals)

Genes with highest scores in test set


  • Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:3)

  • 14/18 SFARI Genes in this list belong to the SFARI Score 1, 2 to Score 2 and only 1 to Score 3

test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4), 
                    GeneSignificance = round(GeneSignificance,4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability, gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set')
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
TANC2 -0.2348 -0.5528 #FE61CF 0.8841 1
RPRD2 -0.0922 -0.5528 #FE61CF 0.8722 Others
MYCBP2 -0.3975 -0.4939 #8195FF 0.8676 Neuronal
ARID1B 0.2711 0.1127 #00A8FF 0.8643 1
KMT2A 0.1347 0.5941 #619CFF 0.8613 1
EP300 -0.0234 0.1127 #00A8FF 0.8576 1
MBD5 0.1943 0.0586 #B79F00 0.8533 1
ANK2 -0.4368 -0.9355 #00B9E3 0.8431 1
RFX7 0.1372 0.0586 #B79F00 0.8414 Others
RAPH1 -0.0534 -0.4939 #8195FF 0.8413 Others
SRRM4 -0.4400 -0.6877 #BF80FF 0.8339 Others
PRDM2 -0.2518 0.0586 #B79F00 0.8323 Others
KMT2D -0.3255 -0.5528 #FE61CF 0.8279 Others
TLN2 -0.4783 -0.7497 #FF66AA 0.8270 Others
NFAT5 0.1035 0.0586 #B79F00 0.8268 Others
C20orf112 0.0314 0.1127 #00A8FF 0.8268 Others
HIVEP2 0.0165 -0.9355 #00B9E3 0.8266 1
NBEA -0.4147 -0.9355 #00B9E3 0.8263 1
GRIN2B -0.2761 -0.7497 #FF66AA 0.8234 1
PLXNC1 -0.0088 -0.0094 #00BECF 0.8223 Others
ST8SIA3 -0.2000 -0.7497 #FF66AA 0.8200 Others
HUNK -0.3273 -0.5100 #F8766D 0.8199 Others
KIAA1109 -0.1173 -0.0094 #00BECF 0.8191 Others
GABBR2 -0.6249 -0.9355 #00B9E3 0.8181 3
SAP130 -0.2482 -0.5528 #FE61CF 0.8174 Others
ANK3 -0.4814 0.0586 #B79F00 0.8169 1
SRCAP -0.3623 -0.5528 #FE61CF 0.8168 1
CREBBP 0.1431 0.1127 #00A8FF 0.8166 1
KCNJ6 -0.1379 -0.9355 #00B9E3 0.8165 Others
NAV1 -0.1734 -0.5528 #FE61CF 0.8163 Neuronal
RNF111 -0.2410 0.0586 #B79F00 0.8156 Others
BAZ2A 0.0534 -0.0094 #00BECF 0.8132 Others
NF1 -0.4123 -0.4939 #8195FF 0.8112 1
BICD1 -0.0390 0.0586 #B79F00 0.8111 Others
GATAD2B -0.4221 -0.5528 #FE61CF 0.8102 Others
TLE3 0.4884 0.1127 #00A8FF 0.8098 Others
ARID1A -0.1378 -0.5528 #FE61CF 0.8095 Others
DAAM1 0.1579 -0.0094 #00BECF 0.8091 Others
POGZ 0.0391 0.1127 #00A8FF 0.8085 1
CDKL5 -0.3642 -0.9355 #00B9E3 0.8084 1
ASAP1 -0.0896 -0.6877 #BF80FF 0.8070 Others
HECTD4 -0.3170 -0.5528 #FE61CF 0.8070 1
DLGAP4 -0.3307 -0.5528 #FE61CF 0.8058 Others
NRXN3 -0.6716 -0.9355 #00B9E3 0.8056 1
RIMBP2 0.2615 -0.0094 #00BECF 0.8052 Others
EP400 -0.1671 -0.5528 #FE61CF 0.8048 2
RNF150 0.0009 -0.0094 #00BECF 0.8046 Others
GLTSCR1L -0.1624 0.0586 #B79F00 0.8045 Others
DROSHA -0.4111 -0.5528 #FE61CF 0.8043 Others
UBR4 -0.2856 -0.5528 #FE61CF 0.8040 Others




Negative samples distribution


Selecting the Negative samples in the test set

negative_set = test_set %>% filter(!SFARI)

negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  12083
   TRUE  3119
   #Total cases  15202
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
## 
## 3119 genes are predicted as ASD-related
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + xlab('Probability') +
                 ggtitle('Probability distribution of the Negative samples in the Test Set') + 
                 theme_minimal()


Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)

  • The pattern is stronger in under-expressed genes

negative_set %>% ggplot(aes(prob, GS, color = MTcor)) + geom_point() + 
                 geom_smooth(method = 'loess', color = '#666666') +
                 geom_hline(yintercept = 0, color='gray', linetype='dashed') + xlab('Probability') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + theme_minimal()


Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging the modules with negative module-Dianosis correlations than to genes belonging to modules with positive ones

negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + 
                 geom_smooth(method='loess', color='#666666') + 
                 geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
                 theme_minimal()

Summarised version, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis (this is unexpected)

plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% 
            left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())


Probability and level of expression


There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + xlab('Mean Expression') + 
              ylab('Probability') + ggtitle('Mean expression vs model probability by gene') +
              theme_minimal()

rm(mean_and_sd)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), 
                                                          n=n())

ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + 
         geom_point(color=plot_data2$Module, alpha = 0.6) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_smooth(method='lm', se=FALSE, color='gray', size = 0.5) + theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)


Probability and lfc


There is a relation between probability and lfc, so it IS capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in genes under-expressed in ASD
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              theme_minimal() + ggtitle('LFC vs model probability by gene')

  • The relation is stronger in Differentially Expressed genes
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
                   ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
                   theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + ylab('') +
                   scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
                   theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Conclusion


The model is capturing the mean level of expression of the genes (indirectly through module memberhsip), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)


Saving results

predictions = test_set

save(predictions, dataset, file='./../Data/Ridge_model_robust_new_SFARI.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [5] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
##  [9] caret_6.0-86       lattice_0.20-41    biomaRt_2.40.5     ggpubr_0.2.5      
## [13] magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3      viridis_0.5.1     
## [17] viridisLite_0.3.0  plotly_4.9.2       knitr_1.28         forcats_0.5.0     
## [21] stringr_1.4.0      dplyr_0.8.5        purrr_0.3.3        readr_1.3.1       
## [25] tidyr_1.0.2        tibble_3.0.0       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.0.0            RSQLite_2.2.0              
##   [3] AnnotationDbi_1.46.1        htmlwidgets_1.5.1          
##   [5] BiocParallel_1.18.1         pROC_1.16.2                
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] miniUI_0.1.1.1              withr_2.1.2                
##  [11] colorspace_1.4-1            Biobase_2.44.0             
##  [13] highr_0.8                   rstudioapi_0.11            
##  [15] stats4_3.6.3                ggsignif_0.6.0             
##  [17] TTR_0.23-6                  labeling_0.3               
##  [19] GenomeInfoDbData_1.2.1      bit64_0.9-7                
##  [21] farver_2.0.3                vctrs_0.2.4                
##  [23] generics_0.0.2              ipred_0.9-9                
##  [25] xfun_0.12                   R6_2.4.1                   
##  [27] GenomeInfoDb_1.20.0         locfit_1.5-9.4             
##  [29] bitops_1.0-6                DelayedArray_0.10.0        
##  [31] assertthat_0.2.1            promises_1.1.0             
##  [33] scales_1.1.0                nnet_7.3-14                
##  [35] ggExtra_0.9                 gtable_0.3.0               
##  [37] timeDate_3043.102           rlang_0.4.5                
##  [39] genefilter_1.66.0           splines_3.6.3              
##  [41] lazyeval_0.2.2              ModelMetrics_1.2.2.2       
##  [43] acepack_1.4.1               broom_0.5.5                
##  [45] checkmate_2.0.0             yaml_2.2.1                 
##  [47] reshape2_1.4.3              abind_1.4-5                
##  [49] modelr_0.1.6                crosstalk_1.1.0.1          
##  [51] backports_1.1.5             httpuv_1.5.2               
##  [53] quantmod_0.4.17             Hmisc_4.4-0                
##  [55] tools_3.6.3                 lava_1.6.7                 
##  [57] ellipsis_0.3.0              BiocGenerics_0.30.0        
##  [59] Rcpp_1.0.4.6                plyr_1.8.6                 
##  [61] base64enc_0.1-3             progress_1.2.2             
##  [63] zlibbioc_1.30.0             RCurl_1.98-1.1             
##  [65] prettyunits_1.1.1           rpart_4.1-15               
##  [67] S4Vectors_0.22.1            zoo_1.8-8                  
##  [69] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [71] cluster_2.1.0               fs_1.4.0                   
##  [73] data.table_1.12.8           openxlsx_4.1.4             
##  [75] reprex_0.3.0                matrixStats_0.56.0         
##  [77] hms_0.5.3                   mime_0.9                   
##  [79] evaluate_0.14               xtable_1.8-4               
##  [81] XML_3.99-0.3                rio_0.5.16                 
##  [83] jpeg_0.1-8.1                readxl_1.3.1               
##  [85] IRanges_2.18.3              shape_1.4.4                
##  [87] compiler_3.6.3              KernSmooth_2.23-17         
##  [89] crayon_1.3.4                htmltools_0.4.0            
##  [91] mgcv_1.8-31                 later_1.0.0                
##  [93] Formula_1.2-3               geneplotter_1.62.0         
##  [95] lubridate_1.7.4             DBI_1.1.0                  
##  [97] dbplyr_1.4.2                MASS_7.3-51.6              
##  [99] Matrix_1.2-18               cli_2.0.2                  
## [101] gdata_2.18.0                parallel_3.6.3             
## [103] gower_0.2.1                 GenomicRanges_1.36.1       
## [105] pkgconfig_2.0.3             foreign_0.8-76             
## [107] recipes_0.1.10              xml2_1.2.5                 
## [109] foreach_1.5.0               annotate_1.62.0            
## [111] XVector_0.24.0              prodlim_2019.11.13         
## [113] rvest_0.3.5                 digest_0.6.25              
## [115] rmarkdown_2.1               cellranger_1.1.0           
## [117] htmlTable_1.13.3            curl_4.3                   
## [119] shiny_1.4.0.2               gtools_3.8.2               
## [121] lifecycle_0.2.0             nlme_3.1-147               
## [123] jsonlite_1.6.1              fansi_0.4.1                
## [125] pillar_1.4.3                fastmap_1.0.1              
## [127] httr_1.4.1                  survival_3.1-12            
## [129] glue_1.3.2                  xts_0.12-0                 
## [131] zip_2.0.4                   png_0.1-7                  
## [133] iterators_1.0.12            glmnet_3.0-2               
## [135] bit_1.1-15.2                class_7.3-17               
## [137] stringi_1.4.6               blob_1.2.1                 
## [139] DESeq2_1.24.0               latticeExtra_0.6-29        
## [141] caTools_1.18.0              memoise_1.1.0